##########################################
### Create Datasets #################
##########################################

## Purpose: This script does additional cleaning
## and standardization of the variables (especially
## removing missing values, creating indices). It
## also merges in the geographic information for
## the Pew subset with MSA data. Third, it creates
## several plots demonstrating patterns of 
## missingness in the dataset. Finally, it constructs
## post-stratification weights.  
## Data In: 
## 1) data_files/data_merged_4_29_2025.rds
## 2) Geo-located Pew respondents:
## data_files/May_2025_LocalPew_GEO_AllYears.csv
## 3) U.S. census income quintiles over time:
## data_files/census_income_quintiles.xlsx
## 4) CPS Microdata for post-stratification weights:
## data_files/CPS_tables_formatted_12_3.xlsx
## Data Out: 
## 1) data_files/datafull_5_6_2025.rds
## 2) data_files/datamsa_5_6_2025.rds
## 3) data_files/cps_pop_estimates.rds
## 4) data_files/cps_pop_estimates_5_year_bin.rds
## 5) plots/AJS/survey_missingness_toinclude.pdf
## 6) plots/AJS/missingness_MSA_updated.pdf

## Software Versions: 

## R version 4.4.1
library(tidyverse) ## 2.0.0
library(openxlsx) ## 4.2.6.1
library(VGAM) ## 1.1.11
library(lubridate) ## 1.9.4

'%ni%' <- Negate('%in%')


## Set working directory
## to replication file

data <- readRDS("data_files/data_merged_4_29_2025.rds")
data.MSA <- read.csv("data_files/May_21_2025_LocalPew_GEO_AllYears.csv")

## income quantiles
income_quintiles <- read.xlsx("data_files/census_income_quintiles.xlsx", sheet = "current_formatted")

## cps microdata for post-stratification weights
cps <- read.xlsx("data_files/CPS_tables_formatted_12_3.xlsx")

'%ni%' <- Negate("%in%")


#######################################
## Additional Cleaning for Models #####
#######################################

## Pew combines don't know refused, Harris doesn't
## in order to create consistency, we drop not sure, 
## refused, and don't know/refused across all surveys
## for main variables of interest

data$inequality.bin <- dplyr::recode(data$inequality_harm,
                                     `Feel` = "Yes",
                                     `Don't Feel` = "No")
data$inequality.bin <- na_if(data$inequality.bin, "Not Sure")
data$inequality.bin <- na_if(data$inequality.bin, "Refused")
data$inequality.bin <- na_if(data$inequality.bin, "DK")
table(data$inequality_harm,
      data$inequality.bin)
data$inequality.bin <- factor(data$inequality.bin)

## creating numeric version 
data$inequality.num <- dplyr::recode(data$inequality.bin,
                                     `Yes` = 1,
                                     `No` = 0)



## union
## put no answer/refused,
## not sure, don't know/refused - all in NA category
unique(data$union.other)
unique(data$union.self)
data$union.other <- na_if(data$union.other,"Don't know/Refused")
data$union.other <- na_if(data$union.other,"No answer/refused")
data$union.other <- na_if(data$union.other, "Not Sure")
data$union.other <- dplyr::recode(data$union.other,
                                  `No, no union member in hh` = "No",
                                  `Yes, self and/or spouse is union member` = "Yes")
data$union.other <- factor(data$union.other,
                           levels = c("No", "Yes"),
                           labels = c("No", "Yes"))

data$union.self <- na_if(data$union.self,"Don't know/Refused")
data$union.self <- na_if(data$union.self,"No answer/refused")
data$union.self <- na_if(data$union.self, "Not Sure")
data$union.self <- dplyr::recode(data$union.self,
                                 `No, no union member in hh` = "No",
                                 `Yes, self and/or spouse is union member` = "Yes")
data$union.self <- factor(data$union.self,
                          levels = c("No", "Yes"),
                          labels = c("No", "Yes"))

## combine union.self with union.other
## to create single "union" variable that captures
## whether self or member of household is a union member
## coding it in this way means that for surveys that only asked about
## self not directly comparable, but Pew survey combined
## these questions (self or other)

## composite (account for NAs):
data$union <- data$union.self
data$union[!is.na(data$union.other) & 
             data$union.other == "Yes"] <- "Yes"
table(data$union)

## Race                                        
data$race2 <- na_if(data$race2, "Not sure/refused")
data$race2 <- factor(data$race2, 
                     levels = c("No", "Yes"),
                     labels = c("No", "Yes"),
                     ordered = FALSE)

## Party
## Pew has don't know/refused, independent, no preference, other
## 1987 these are combined (indicated by a separate category)
## Harris has not sure, refused, other  

## party.harm - comparable across all surveys 
## drops not sure, don't know, refused across all waves,
## as well as special category "Indep/No Pref/Other/Don't Know (1987 only)"
## the 1987 wave combined the independent with not sure 
## so that data is not strictly comparable 
## we also keep original "party" for diving into pew data more 

data$party.harm <- data$party
data$party.harm <- na_if(data$party.harm,"Independent/Other/Don't Know/Refused")
data$party.harm <- na_if(data$party.harm,"DK/Ref")
data$party.harm <- na_if(data$party.harm,"Not Sure")  
data$party.harm <- na_if(data$party.harm,"Refused") 

data$party.harm <- dplyr::recode(data$party.harm,
                                 `Independent` = "Indep/Other",
                                 `No preference` = "Indep/Other",
                                 `Other` = "Indep/Other")

data$party.harm <- factor(data$party.harm,
                          levels = c("Republican", "Democrat",
                                     "Indep/Other"),
                          labels = c("Republican" , "Democrat",
                                     "Indep/Other"))


# Education
unique(data$educ)
data$educ <- na_if(data$educ,"No answer/refused")
data$educ <- na_if(data$educ, "DK/Ref")
data$educ <- na_if(data$educ, "Not sure")                                 
data$educ <- na_if(data$educ, "Refused")

data$educ <- dplyr::recode(data$educ,
                           `Less than high` = "Less than high school")

data$educ <- factor(data$educ,
                    levels = c("Less than high school",
                               "High school graduate",
                               "Some college",
                               "College graduate",
                               "Post graduate"),
                    labels = c("Less than high school",
                               "High school graduate",
                               "Some college",
                               "College graduate",
                               "Post graduate"),
                    ordered = TRUE)


## creating collapsed version
data$educ_collapse <- dplyr::recode(data$educ,
                                    `Less than high school` = "HS or less",
                                    `High school graduate` = "HS or less",
                                    `College graduate` = "College graduate or greater",
                                    `Post graduate` = "College graduate or greater")

# Cohort - replicating Hout and Fischer 2014 cohorts
data <- data %>% dplyr::mutate(cohort = case_when(birth < 1900 ~ "Before 1900",
                                                  birth >= 1900 & birth < 1915 ~ "1900-1914",
                                                  birth >= 1915 & birth < 1925 ~ "1915-1924",
                                                  birth >= 1925 & birth < 1935 ~ "1925-1934",
                                                  birth >= 1935 & birth < 1945 ~ "1935-1944",
                                                  birth >= 1945 & birth < 1955 ~ "1945-1954",
                                                  birth >= 1955 & birth < 1965 ~ "1955-1964",
                                                  birth >= 1965 & birth < 1975 ~ "1965-1974",
                                                  birth >= 1975 & birth < 1985 ~ "1975-1984",
                                                  birth >= 1985 & birth <= 1995 ~ "1985-1995"))



## cohort choosing "1945-1954" as base level
data$cohort <- factor(data$cohort,
                      levels = c("1945-1954", "Before 1900", "1900-1914",
                                 "1915-1924", "1925-1934", "1935-1944",
                                 "1955-1964", "1965-1974", "1975-1984",
                                 "1985-1995"),
                      labels =  c("1945-1954", "Before 1900", "1900-1914",
                                  "1915-1924", "1925-1934", "1935-1944",
                                  "1955-1964", "1965-1974", "1975-1984",
                                  "1985-1995"))

## Cohort measure with categorical groupings
data <- data %>% mutate(cohort2 = case_when(birth < 1915 ~ "Before Greatest",
                                                      birth >=1915 & birth <1926 ~ "Greatest",
                                                      birth >= 1926 & birth <1945 ~ "Silent",
                                                      birth >= 1945 & birth <1965 ~ "Boomers",
                                                      birth >= 1965 & birth <1981 ~ "Gen-X",
                                                      birth >= 1981 ~ "Millenials"))


data$cohort2 <- factor(data$cohort2,
                            levels = c("Boomers", "Before Greatest",
                                       "Greatest",
                                       "Silent", "Gen-X", "Millenials"),
                            labels = c("Boomers", "Before Greatest",
                                       "Greatest",
                                       "Silent", "Gen-X", "Millenials"))



## Study
data$study <- as.character(data$study)

### racial_discrimination
data$racial_discrimination <- dplyr::recode(data$racial_discrimination,
                                            `Mostly agree` = "Feel",
                                            `Mostly disagree` = "Don't Feel",
                                            `Completely agree` = "Feel",
                                            `Completely disagree` = "Don't Feel")
data$racial_discrimination <- na_if(data$racial_discrimination, "DK")

## racial_gains
data$racial_gains <- dplyr::recode(data$racial_gains,
                                   `Mostly agree` = "Feel",
                                   `Mostly disagree` = "Don't Feel",
                                   `Completely agree` = "Feel",
                                   `Completely disagree` = "Don't Feel")
data$racial_gains <- na_if(data$racial_gains, "DK")

## gender
data$gender <- dplyr::recode(data$gender,
                             `FEMALE` = "female",
                             `Female` = "female",
                             `MALE` = "male",
                             `Male` = "male")



## including survey year in buckets of 5 
data$year_5 <- ifelse(data$year <= 1970,
                           "1966-1970",
                           ifelse(data$year <= 1975,
                                  "1971-1975",
                                  ifelse(data$year <= 1980,
                                         "1976-1980",
                                         ifelse(data$year <= 1985,
                                                "1981-1985",
                                                ifelse(data$year <= 1990,
                                                       "1986-1990",
                                                       ifelse(data$year <= 1995,
                                                              "1991-1995",
                                                              ifelse(data$year <= 2000,
                                                                     "1996-2000",
                                                                     ifelse(data$year <= 2005,
                                                                            "2001-2005",
                                                                            ifelse(data$year <= 2010,
                                                                                   "2006-2010",
                                                                                   "2011-2013")))))))))


## unordered factor
data$year_5 <- factor(data$year_5)



##########################
## alienation index ######
##########################

## this measure we aim to capture how many respondent "yes"
## responses to alienation question 

## we only create the alienation index for surveys 
# which included all three questions asked 
## (5 surveys didn't ask all three questions)
## scale of 0-3 - get a point for each yes,
## larger values indicates greater "alienation"

## we have to calculate it separately for Harris and Pew because 
## question wording different, which necessitated taking the inverse for some
## questions 


## pew and harris studies
pew.studies <- names(table(data$study))[grepl("Pew", names(table(data$study)))]
harris.studies <- names(table(data$study))[names(table(data$study)) %ni% pew.studies]


## identifying studies that didn't ask three questions
alienation.check <- data %>% group_by(study) %>%
  dplyr::summarize(dontcare = sum(is.na(dontcare)) / length(dontcare),
                   dontcount = sum(is.na(dontcount)) / length(dontcount),
                   leftout = sum(is.na(leftout)) / length(leftout))

check <- base::apply(alienation.check[, c(2:4)], 1, function(x){
  test <- x > .99
  test <- sum(test) > 0
  return(test)
})
alienation.check <- alienation.check[check, ]

##  Harris; 2135 (asked two), 792801 (asked two), 
## 7684, 812027, 911108 (missing all three)
## Pew Studies also missing those because
## the information was included under a different name
## which we deal with below

remove.harris <- c("911108", "812027", "792801", "7684", "2135")
harris.study.rm <- harris.studies[! harris.studies %in% remove.harris]

# pew is more complicated because 
## "pew.govtcare" and "pew.finsat" require negatvie 
# (Don't Feel)
## to indicate alienation 

data$pew.govtcarereverse <- dplyr::recode(data$pew.govtcare,
                                          `Feel` = "Don't Feel",
                                          `Don't Feel` = "Feel")
data$pew.govtcarereverse <- na_if(data$pew.govtcarereverse, "DK")

data$pew.finsatreverse <- dplyr::recode(data$pew.finsat,
                                        `Feel` = "Don't Feel",
                                        `Don't Feel` = "Feel")
data$pew.finsatreverse <- na_if(data$pew.finsatreverse,
                                "DK")

## removing nas from other alienation index variables
data$pew.nosay <- na_if(data$pew.nosay, "DK")
data$dontcare <- na_if(data$dontcare, "Refused")
data$dontcare <- na_if(data$dontcare, "Not Sure")
data$dontcare <- dplyr::recode(data$dontcare,
                               `Not Feel` = "Don't Feel")

data$leftout <- na_if(data$leftout, "Refused")
data$leftout <- na_if(data$leftout, "Not Sure")
data$leftout <- dplyr::recode(data$leftout,
                              `Not Feel` = "Don't Feel")

data$dontcount <- na_if(data$dontcount, "Refused")
data$dontcount <- dplyr::recode(data$dontcount,
                                `Not Feel` = "Don't Feel")
data$dontcount <- na_if(data$dontcount, "Not Sure")

## function to create alienation index
## sums number of "feel" responses 
## does this by study and only for 
## studies which asked all three questions
## and for respondents which answered all three 
## questions

alienation <- function(pew1, pew2, pew3, harris1, harris2, harris3, study){
  if(study %in% pew.studies){
    values <- c(pew1, pew2, pew3)
    if(sum(is.na(values)) < 1){
      counter <- sum(values == "Feel")
    } else {
      counter <- NA
    }
  } else if(study %in% harris.study.rm){
    values <- c(harris1, harris2, harris3)
    if(sum(is.na(values)) < 1){
      counter <- sum(values == "Feel")
    } else {
      counter <- NA
    }
  }
  else{
    counter <- NA
  }
  return(counter)
}

## example:
alienation(harris1 = "Feel", harris2 = "Feel", harris3 = NA, 
           pew1 = NA, pew2 = NA, pew3 = "Feel", study = "PEW02")

## apply to harris: study, dontcare, dontcount, leftout
## apply to pew: study, pew.nosay, pew.govtcarereverse, pew.finsatreverse

study <- which(colnames(data) == "study")
dontcare <- which(colnames(data) == "dontcare")
dontcount <- which(colnames(data) == "dontcount")
leftout <- which(colnames(data) == "leftout")
pew.nosay <- which(colnames(data) == "pew.nosay")
pew.govtcarereverse <- which(colnames(data) == "pew.govtcarereverse")
pew.finsatreverse <- which(colnames(data) == "pew.finsatreverse")

data$alienation <- base::apply(data, 1, function(x){alienation(study = x[study], 
                                                               harris1 = x[dontcare],
                                                               harris2 = x[dontcount],
                                                               harris3 = x[leftout],
                                                               pew1 = x[pew.nosay],
                                                               pew2 = x[pew.govtcarereverse],
                                                               pew3 = x[pew.finsatreverse])})


data$alienation <- dplyr::recode(data$alienation,
                                 `0` = "Zero_Q",
                                 `1` = "One_Q",
                                 `2` = "Two_Q",
                                 `3` = "Three_Q")
data$alienation <- factor(data$alienation,
                          levels = c("Zero_Q",
                                     "One_Q",
                                     "Two_Q",
                                     "Three_Q"))


################################
### Income Quintile and SES ####
################################

## the income quintiles spreadsheet 
## is from the U.S. Census (see README)
## and includes information on the U.S.
## income quintiles over time.
## It includes the upper bound for quintiles 1-4 by year


## following footnotes on data downloaded from
## Census, we remove the extra entries
## from 2017 and 2013

income_quintiles <- income_quintiles %>%
  filter(Year != "2017 (40)" &
           Year != "2013 (39)")
income_quintiles$Year <- gsub("\\([0-9]{1,}\\)", "", income_quintiles$Year)
income_quintiles$Year <- as.numeric(income_quintiles$Year)

##note: we don't have observation for 1966
## so we use quintiles from 1967 for those observations
## (we have no observations from 1967)
income_quintiles$Year[income_quintiles$Year == 1967] <- 1966
colnames(income_quintiles)[1] <- "year"

data <- left_join(data,
                  income_quintiles,
                  by = "year")

## we use non-deflated estimates for income because
## census income quintiles were not deflated

data$income_quintile <- ifelse(data$income.final <= data$upper_1st,
                               "quintile_1",
                               ifelse(data$income.final <= data$upper_2nd,
                                      "quintile_2",
                                      ifelse(data$income.final <= data$upper_3rd,
                                             "quintile_3",
                                             ifelse(data$income.final <= data$upper_4th,
                                                    "quintile_4",
                                                    "quintile_5"))))


data$income_quintile <- factor(data$income_quintile,
                                    levels = c("quintile_1",
                                               "quintile_2",
                                               "quintile_3",
                                               "quintile_4",
                                               "quintile_5"),
                                    ordered = TRUE)

## creating SES index
## to equal weight education and income quintile:
## (5/3 * education + income_quintile) / 2
## creates scale from 1 -5 - include as factor variable

data$ses <- round((as.numeric(data$educ_collapse) * 5/3 + 
                          as.numeric(data$income_quintile)) / 2)
data$ses <- factor(data$ses)


##############################################
### Cleaning Pew 2002 Dataset ################
### Separately for MSA Analysis ##############
##############################################

## When we merged in the geographic MSA
## variables, we used the indivudual Pew data
## files (rather than the merged files). Using
## the individual files rather than merged files
## makes no difference except for 2002. 
## It seems the merged Pew dataset included a slightly
## different set of respondents than the individual
## survey year dataset. Here we clean that 2002
## dataset and below we merge in the MSA variables.
## We only use this 2002 dataset for the MSA-level
## analysis. For all other analyses for 2002 we use
## the merged data. 

## TO FINISH 

####################################
## merging MSA data with full data
#####################################


### step 1: standardize all local geographical variables
## by year - because they were calculated with different
## units of analysis - for some years we had MSAs, for some
## years we had area codes 

## income exposure
data.MSA <- data.MSA %>%
  group_by(syear) %>% 
  mutate(incexpose.std = (incexpose - mean(incexpose, na.rm = TRUE)) / sd(incexpose, na.rm = TRUE))

## poverty rate
data.MSA <- data.MSA %>%
  group_by(syear) %>% 
  mutate(povertyrate.std = (povertyrate - mean(povertyrate, na.rm = TRUE)) / sd(povertyrate, na.rm = TRUE))

## gini
data.MSA <- data.MSA %>%
  group_by(syear) %>% 
  mutate(gini.std = (gini - mean(gini, na.rm = TRUE)) / sd(gini, na.rm = TRUE))

## relative mean deprivaiton
data.MSA <- data.MSA %>%
  group_by(syear) %>% 
  mutate(rmd.std = (rmd - mean(rmd, na.rm = TRUE)) / sd(rmd, na.rm = TRUE))

## segregation
data.MSA <- data.MSA %>%
  group_by(syear) %>% 
  mutate(segregation.std = (segregation - mean(segregation, na.rm = TRUE)) / sd(segregation, na.rm = TRUE))

## wealth - median
data.MSA <- data.MSA %>%
  group_by(syear) %>% 
  mutate(iGINIhouseVALmed.std = (iGINIhouseVALmed - mean(iGINIhouseVALmed, na.rm = TRUE)) / sd(iGINIhouseVALmed, na.rm = TRUE))

## wealth - q4
data.MSA <- data.MSA %>%
  group_by(syear) %>% 
  mutate(iGINIhouseVALq4.std = (iGINIhouseVALq4 - mean(iGINIhouseVALq4, na.rm = TRUE)) / sd(iGINIhouseVALq4, na.rm = TRUE))


### merging MSA geographic variables
### with respondent information
## the the file data.MSA only has
## the local geogrpahic data

## challenge 1: Pew duplicated respid in a few surveys:
## 358 in 1987, 25 in 2003 
## challenge 2: 1989 is missing all geographic variables,
## as noted in the SI we also do not have geographic 
## information for all respondents even in studies/years
## where we have it for some

## this shown here:
data.MSA %>%
  dplyr::group_by(syear) %>%
  dplyr::summarize(n = n(),
                   respid = length(unique(respid)),
                   gini = sum(!is.na(gini)),
                   rmd = sum(!is.na(rmd)),
                   inc = sum(!is.na(incexpose.std)),
                   pov = sum(!is.na(povertyrate.std)),
                   segregation = sum(!is.na(segregation.std)))


## as a result, for 1987, 2003 we join on gender,
## state, and respondent ID
## for other years just respondent ID
## harmonizing variable values
## for merging: 
data.MSA$sex <- dplyr::recode(data.MSA$sex,
                              `Female` = "female",
                              `Male` = "male")
data.MSA$state <- dplyr::recode(data.MSA$state,
                                `Washington D.C.` = 
                                  "Washington DC")
data.MSA$age <- dplyr::recode(data.MSA$age,
                              `Don't know/Refused` = "DK/Ref.")

## data to merge into
data_all_MSA <- data %>%
  filter(study %in% c("Pew1987",
                      "Pew1988",
                      "Pew1989",
                      "Pew1992",
                      "Pew1997",
                      "Pew1999",
                      "Pew2003",
                      "Pew2007",
                      "Pew2009",
                      "Pew2012"))


## checking missingness of merging variables
## for 1987, 2003: 0 
data.MSA %>%
  filter(syear %in% c(1987, 2003)) %>%
  summarize(sex = sum(is.na(sex) |
                        sex == ""),
            respid = sum(is.na(respid) |
                           respid == ""),
            state = sum(is.na(state) |
                          state == ""))
data_all_MSA %>%
  filter(year %in% c(1987, 2003)) %>%
  summarize(gender_c = sum(is.na(gender) |
                        gender == ""),
            RESPID = sum(is.na(RESPID) |
                           RESPID == ""),
            state = sum(is.na(state) |
                          state == ""))

## for other years: 0 
data.MSA %>%
  filter(syear %ni% c(1987, 2003)) %>%
  summarize(respid_na = sum(is.na(respid)))
data_all_MSA %>%
  filter(year %ni% c(1987, 2003)) %>%
  summarize(respid_na = sum(is.na(RESPID)))


##create matching id
data.MSA$matching <- ifelse(data.MSA$syear %in%
                              c(1987, 2003),
                            paste(data.MSA$sex,
                                   data.MSA$respid,
                                   data.MSA$state,
                                  data.MSA$syear,
                                   sep = "_"),
                            paste(data.MSA$respid,
                                  data.MSA$syear,
                                  sep = "_"))
data_all_MSA$matching <-ifelse(data_all_MSA$year %in%
                                 c(1987, 2003),
                               paste(data_all_MSA$gender,
                                     data_all_MSA$RESPID,
                                     data_all_MSA$state,
                                     data_all_MSA$year,
                                     sep = "_"),
                               paste(data_all_MSA$RESPID,
                                     data_all_MSA$year,
                                     sep = "_"))

## we test whether
## merged correctly by comparing
## age (variable available in both 
## dataset but not used to merge)
data_all_MSA$age_merge <- data.MSA$age[match(data_all_MSA$matching,
                                             data.MSA$matching)]
data_all_MSA %>%
  group_by(year) %>%
  summarize(n = n(),
            check = sum(age != age_merge,
                        na.rm = TRUE))

## checking whether merge variable 
## exists in both datasets 
data_all_MSA %>%
  group_by(year) %>%
  summarize(match = sum(matching %ni% data.MSA$matching),
            n = n())
data.MSA %>%
  group_by(syear) %>%
  summarize(match = sum(matching %ni% data_all_MSA$matching),
            n = n())

data_all_MSA <- left_join(data_all_MSA,
                          data.MSA,
                          by = "matching")


data_all_MSA <- data_all_MSA %>%
  dplyr::select(inequality.num, year, year_5, cohort, cohort2, 
                party.harm, age2,
                educ, educ_collapse, income.final.deflated, income_quintile,
                race2, 
                gini, gini.std, iGINIhouseVALmed,iGINIhouseVALmed.std, iGINIhouseVALq4,
                iGINIhouseVALq4.std, incexpose, incexpose.std, povertyrate, povertyrate.std, rmd, 
                rmd.std, segregation, segregation.std,
                gender, birth, ses, alienation)

## remove 1989 values
## because no geographic data
data_all_MSA <- data_all_MSA %>%
  filter(year != 1989)

rm(data.MSA)

#######################################
###### Testing Missingness #############
#######################################


## checking how many surveys we lose with each variable:
var.check <- data %>% group_by(study) %>%
  dplyr::summarize(year = unique(year),
                   inequality = sum(is.na(inequality))/length(inequality),
                   inequality.bin = sum(is.na(inequality.bin))/length(inequality.bin),
                   inequality.num = sum(is.na(inequality.num))/length(inequality.num),
                   party.harm = sum(is.na(party.harm))/length(party.harm),
                   cohort = sum(is.na(cohort))/length(cohort),
                   age2 = sum(is.na(age2))/length(age2),
                   party = sum(is.na(party))/length(party),
                   educ = sum(is.na(educ))/length(educ),
                   gender = sum(is.na(gender))/length(gender),
                   income.final.deflated = sum(is.na(income.final.deflated))/length(income.final.deflated),
                   race2 = sum(is.na(race2))/length(race2),
                   alienation = sum(is.na(alienation))/length(alienation))

var.check2 <- gather(var.check,
                     key = "variable",
                     value = "percent",
                     -c(study, year))

var.check2$year <- as.Date(as.character(var.check2$year), 
                           format = "%Y")
var.check2$year <- year(var.check2$year)

var.check2$variable <- dplyr::recode(var.check2$variable,
                                     `age2` = "age",
                                     `educ` = "education",
                                     `income.final.deflated` = "income",
                                     `inequality.num` = "inequality",
                                     `party.harm` = "party harmonized",
                                     `race2` = "race white")

ggplot(var.check2[var.check2$variable != "inequality1" &
                    var.check2$variable != "inequality.bin" &
                    var.check2$variable != "party", ], aes(x = year, y = percent)) +
  geom_point(alpha = .5, color = "red") +
  facet_wrap(~ variable) +
  xlab(NULL) +
  ylab("Percent Missing Observations") +
  theme_bw() +
  scale_y_continuous(labels = scales::percent_format()) +
  theme(axis.text.x = element_text(angle = 45,
                                   margin = margin(t = 5)))
## figures/survey_missingness_toinclude.pdf
## 5x7


## checking missingness in MSA subset
#### look at patterns of missingness in the MSA sample
var.check <- data_all_MSA %>% group_by(year) %>%
  dplyr::summarize(inequality.num = sum(is.na(inequality.num))/length(inequality.num),
                   party.harm = sum(is.na(party.harm))/length(party.harm),
                   cohort = sum(is.na(cohort))/length(cohort),
                   age2 = sum(is.na(age2))/length(age2),
                   educ = sum(is.na(educ))/length(educ),
                   income.final.deflated = sum(is.na(income.final.deflated))/length(income.final.deflated),
                   race2 = sum(is.na(race2))/length(race2),
                   gini = sum(is.na(gini))/length(gini),
                   segregation = sum(is.na(segregation))/length(segregation),
                   incexpose = sum(is.na(incexpose))/length(incexpose),
                   iGINIhouseValmed = sum(is.na(iGINIhouseVALmed))/length(iGINIhouseVALmed),
                   iGINIhouseVALq4 = sum(is.na(iGINIhouseVALq4))/length(iGINIhouseVALq4),
                   povertyrate = sum(is.na(povertyrate))/length(povertyrate),
                   rmd = sum(is.na(rmd))/length(rmd),
                   gender_matching = sum(is.na(gender))/length(gender))


var.check2 <- tidyr::gather(var.check,
                            key = "variable",
                            value = "percent",
                            -c(study, year))

var.check2$Year <- as.Date(as.character(var.check2$year), 
                           format = "%Y")
var.check2$Year <- year(var.check2$Year)

var.check2$variable <- dplyr::recode(var.check2$variable,
                                     `age2` = "age",
                                     `educ` = "education",
                                     `income.final.deflated` = "income",
                                     `inequality.num` = "inequality",
                                     `party.harm` = "party harmonized",
                                     `race2` = "race white")

ggplot(var.check2, aes(x = year, y = percent)) +
  geom_point(alpha = .5, color = "red") +
  facet_wrap(~ variable) +
  xlab(NULL) +
  ylab("Percent Missing Observations") +
  theme_bw() +
  scale_y_continuous(limits = c(0, 1),
                     labels = scales::percent_format()) +
  theme(axis.text.x = element_text(angle = 45,
                                   margin = margin(t = 5)))
## figures/missingness_MSA_updated.pdf
## 5x7


##########################################
###### Creating Dataframes ###############
##########################################

### dropping obs. with missing
## values- 77,971 observations
## note: removing missing values and removes the ~445
## observations of individuals under the age of 18
## that we wanted to exclude 

data.full <- data[complete.cases(data[,c("inequality", "inequality.num", "inequality.bin", "year", "cohort", 
                                         "party.harm", "age2",
                                         "party", "educ", "income.final.deflated", "gender",
                                         "race2")]),]



data_all_MSA <- data_all_MSA[complete.cases(data_all_MSA[,c("inequality.num", "year", "cohort", 
                                                            "party.harm", "age2",
                                                            "educ", "income.final.deflated",
                                                            "race2", "gender",
                                                            "gini", "iGINIhouseVALmed",
                                                            "iGINIhouseVALq4", "incexpose",
                                                            "povertyrate", "rmd", "segregation")]),]

nrow(data) ## 115067 total observations
length(unique(data$study)) ## 71 surveys
sum(grepl("Pew", unique(data$study))) ## 15 pew

nrow(data.full) ## 77971
length(unique(data.full$study)) ## 62 surveys
sum(grepl("Pew", unique(data.full$study))) ## 14 pew

nrow(data_all_MSA) ## 8,528
length(unique(data_all_MSA$year)) ## 9 surveys
nrow(data[data$year %in% unique(data_all_MSA$year) &
            grepl("Pew", data$study), ]) ## out of 23,488


#######################################
## Calculating Weights for Harris Data (and Pew-compatible)
#######################################
#######################################

## we construct post-stratification weights for the analytic 
## sample (data.full)
## on age, sex, edu

## each row of cps is population counts broken by education level
## for a given year, gender, age group 
colnames(cps)[3:6] <- c("educ_less_HS",
                        "educ_HS",
                        "educ_some_college",
                        "educ_college+")

cps$year_5 <- ifelse(as.numeric(as.character(cps$Year)) <= 1965,
                     "1962-1965",
                     ifelse(
                    as.numeric(as.character(cps$Year)) <= 1970,
                     "1966-1970",
                     ifelse(as.numeric(as.character(cps$Year)) <= 1975,
                            "1971-1975",
                            ifelse(as.numeric(as.character(cps$Year))<= 1980,
                                   "1976-1980",
                                   ifelse(as.numeric(as.character(cps$Year)) <= 1985,
                                          "1981-1985",
                                          ifelse(as.numeric(as.character(cps$Year)) <= 1990,
                                                 "1986-1990",
                                                 ifelse(as.numeric(as.character(cps$Year)) <= 1995,
                                                        "1991-1995",
                                                        ifelse(as.numeric(as.character(cps$Year)) <= 2000,
                                                               "1996-2000",
                                                               ifelse(as.numeric(as.character(cps$Year)) <= 2005,
                                                                      "2001-2005",
                                                                      ifelse(as.numeric(as.character(cps$Year)) <= 2010,
                                                                             "2006-2010",
                                                                             ifelse(as.numeric(as.character(cps$Year)) <= 2013,
                                                                                    "2011-2013",
                                                                                    "2015-2020")))))))))))

## summing category totals by 5 years
## for alternative weights (used downstream)
cps_5 <- cps %>%
  group_by(year_5, group) %>%
  summarize(educ_less_HS = sum(educ_less_HS),
            educ_HS = sum(educ_HS),
            educ_some_college = sum(educ_some_college),
            `educ_college+` = sum(`educ_college+`))


## total across cells by year
cps_totals <- cps %>%
  dplyr::group_by(Year) %>%
  dplyr::summarize(total = sum(educ_less_HS,
                               educ_HS,
                               educ_some_college,
                               `educ_college+`))

cps_totals_5 <- cps_5 %>%
  dplyr::group_by(year_5) %>%
  dplyr::summarize(total_5 = sum(educ_less_HS,
                               educ_HS,
                               educ_some_college,
                               `educ_college+`))

## joining datasets
cps$Year <- factor(cps$Year)
cps_totals$Year <- factor(cps_totals$Year)
cps <- left_join(cps, cps_totals, by = "Year")

cps_5 <- left_join(cps_5, 
                   cps_totals_5, by = "year_5")


## creating population categories that match 
##the education breakdown
## in our dataset 
## calculating the percent of total
## population in education-gender-age group
## for each year

cps$HSgrad <- cps$educ_HS / cps$total
cps$somecollege <- cps$educ_some_college/cps$total
cps$collegemore <- cps$`educ_college+`/cps$total
cps$less_thanHS <- cps$educ_less_HS / cps$total

cps_5$HSgrad <- cps_5$educ_HS / cps_5$total_5
cps_5$somecollege <- cps_5$educ_some_college/cps_5$total_5
cps_5$collegemore <- cps_5$`educ_college+`/cps_5$total_5
cps_5$less_thanHS <- cps_5$educ_less_HS / cps_5$total_5

## category variable for cps - for matching  
cps$category <- paste(cps$Year, cps$group,
                      sep = "_")
cps_5$category <- paste(cps_5$year_5, cps_5$group,
                      sep = "_")

## selecting just category and cps estimated population
## proportion
## for that age-gender-education category
weights <- cps %>% dplyr::select(HSgrad, somecollege,
                                 collegemore, less_thanHS,
                                 category)
weights_5 <- cps_5 %>% dplyr::select(HSgrad, somecollege,
                                 collegemore, less_thanHS,
                                 category)
## create long version 
weights <- gather(weights, 
                  key = "Year_Gender_Age_Edu", 
                  value = "Population",
                  -category)
weights$category <- gsub(" ", "_", weights$category)

weights_5 <- gather(weights_5,
                    key = "Year_Gender_Age_Edu",
                    value = "Population",
                    -category, -year_5)
weights_5$category <- gsub(" ", "_", weights_5$category)



## creating final label we merge on 
weights$Year_Gender_Age_Edu <- paste(weights$category,
                                     weights$Year_Gender_Age_Edu,
                                     sep = "_")
weights_5$Year_Gender_Age_Edu <- paste(weights_5$category,
                                     weights_5$Year_Gender_Age_Edu,
                                     sep = "_")


## save file for use with counterfactuals
#saveRDS(weights, "data_files/cps_pop_estimates.rds")
#saveRDS(weights_5, "data_files/cps_pop_estimates_5_year_bin.rds")

## creating variables with same levels in survey data
## collapse college graduate and postgraduate
## into "collegemore", mimicking the cps categories
data.full$educ.weights <- dplyr::recode(data.full$educ,
                                        `Less than high school` = "less_thanHS",
                                        `High school graduate` = "HSgrad",
                                        `Some college` = "somecollege",
                                        `College graduate` = "collegemore",
                                        `Post graduate` = "collegemore")


data.full <- data.full %>% mutate(age.weights = case_when(age2 <= 24 ~ "18-24",
                                                          age2 > 24 & age2 <= 34 ~ "25-34",
                                                          age2 > 34 & age2 <= 54 ~ "35-54",
                                                          age2 > 54 ~ "55+"))



## creating mergin variable for weights
data.full$Year_Gender_Age_Edu <- paste(data.full$year, 
                                       data.full$gender, 
                                       data.full$age.weights, 
                                       data.full$educ.weights,
                                       sep = "_")
## check to make sure levels match
sum(data.full$Year_Gender_Age_Edu %ni% weights$Year_Gender_Age_Edu) ## 0


## create counts by year (summing across surveys
## from same year) for total respondents 
## in each age, gender, educ category

table <- data.full %>%
  dplyr::group_by(year, Year_Gender_Age_Edu) %>%
  dplyr::summarize(survey_n = n())

## calculate total n for each study
study_n <- table %>%
  dplyr::group_by(year) %>%
  dplyr::summarize(total = sum(survey_n))

table <- left_join(table, study_n, by = "year")


## join weights and table
table <- left_join(table, weights[, c("Year_Gender_Age_Edu",
                                      "Population")],
                   by = "Year_Gender_Age_Edu")

## observed is the observed percent in our survey for that category
table$observed <- table$survey_n/table$total

## Population is the observed percent in CPS for that category

## check: observed should sum to 1 by year, 
## Population should sum to 1 by year as well
check <- table %>% group_by(year) %>%
  dplyr::summarize(test_obs = sum(observed),
                   test_pop = sum(Population))
table(check$test_obs)
table(check$test_pop,
      check$year) ## three years (1966,2010, 2011)
## slightly off from 1


## creating survey weights - upweights category which were under observed 
## in data, downweights over observed categories
table$weight.fulldata <- table$Population/table$observed

## merging with data
data.full$weight.fulldata <- table$weight.fulldata[
  match(data.full$Year_Gender_Age_Edu,
        table$Year_Gender_Age_Edu)
]


## Checking to make sure sum of weights equal to number of observations in study 
check <- data.full %>% filter(!is.na(weight.fulldata)) %>%
  group_by(year) %>% 
  dplyr::summarize(weight.count = sum(weight.fulldata),
                   study.2 = length(study))

which(check$weight.count != check$study.2)
check[c(1,34,35),] ## same years as above
## very slight differences


# trim weights to .3 to 3 
data.full$weight.fulldata[!is.na(data.full$weight.fulldata) &
                            data.full$weight.fulldata > 3] <- 3
data.full$weight.fulldata[is.na(data.full$weight.fulldata) &
                            data.full$weight.fulldata < .3] <- .3


#saveRDS(data.full, "data_files/datafull_5_6_2025.rds")
#saveRDS(data_all_MSA, "data_files/datamsa_5_6_2025.rds")
